### Configuration
################################################################################
### R Options
# Query SLURM for the resources configured on the current node; values are used
# to size parallel workers and the 'future' globals limit below.
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
# CfgTRES reports memory in MB; convert to bytes (MB * 1024^2), since
# future.globals.maxSize is interpreted in bytes.
memInBytes = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2
options(stringsAsFactors=FALSE,
        dplyr.summarise.inform=FALSE,
        # Cap exported globals at 2 GB or the node's configured memory, whichever is smaller
        future.globals.maxSize=min(memInBytes, 2 * 1024^3),
        # Use all configured CPUs, at least 1 (min(cpus, 1) would always yield 1)
        mc.cores=max(cpus, 1),
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")
### Functions
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_analysis.R",
                        "functions_biomart.R",
                        "functions_degs.R",
                        "functions_io.R",
                        "functions_plotting.R",
                        "functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
# Abort early with a readable message listing all missing files
# (stop() concatenates its arguments, so no nested paste is needed)
if (any(!file_exists)) stop("The following files could not be found: ",
                            paste(git_files_to_source[!file_exists], collapse=", "),
                            ". Please check the git directory at '", param$path_to_git, "'!")
invisible(purrr::map(git_files_to_source, source))
### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions
param$debugging_mode = "default_debugging"
# Dispatch to the matching error handler; anything unrecognised falls back to the default
if (param$debugging_mode == "terminal_debugger") {
  on_error_start_terminal_debugger()
} else if (param$debugging_mode == "print_traceback") {
  on_error_just_print_traceback()
} else {
  on_error_default_debugging()
}
### Rmarkdown configuration
################################################################################
### Create output directories
# Create the "figures" and "data" output folders below param$path_out if missing
for (out_subdir in c("figures", "data")) {
  out_path = file.path(param$path_out, out_subdir)
  if (!file.exists(out_path)) dir.create(out_path, recursive=TRUE, showWarnings=FALSE)
}
### R Options
# knitr / kableExtra output options for HTML report rendering
options(citation_format="pandoc",
knitr.table.format="html",
kableExtra_view_html=TRUE)
### Knitr default options
knitr::opts_chunk$set(echo=TRUE, # output code
cache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source='fold-hide', # by default collapse code blocks
dev=c('png', 'svg', 'tiff'), # create figures in png, tiff, and svg; the first device (png) will be used for HTML output
dev.args=list(png = list(type="cairo"), # png: use cairo - works on cluster
tiff = list(type="cairo", compression = 'zip')), # tiff: use cairo - works on cluster, transparent background, compression type "zip" is ‘deflate (Adobe-style)’
dpi=300, # figure resolution
fig.retina=2, # retina multiplier
fig.path=paste0(file.path(param$path_out, "figures"), "/") # Path for figures in png and pdf format (trailing "/" is needed)
)
### Set output hooks
# format_message/format_warning are helper functions sourced from the git "R" directory above
knitr::knit_hooks$set(message=format_message, warning=format_warning)
### Required libraries
library(magrittr) # provides the %>% pipe operator
### Set the bibliographic environment
# Clear previous bibliography
knitcitations::cleanbib()
# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
rcug_ref = knitcitations::read.bibtex(file.path(param$path_to_git,"assets","rcug_references.bib"))
invisible(knitcitations::citep(rcug_ref))
### Figure heights
# Default figure heights (in inches) for common panel layouts; patchwork
# heights are derived from the single-row heights above them
# high figures
fig_large_height = 8
# Single figure, landscape
fig_standard_height = 4
# Two plots alongside (e.g. umaps)
fig_standard2_height = 5
# Three plots alongside (e.g. umaps)
fig_standard3_height = 4
# Four plots alongside (e.g. umaps)
fig_standard4_height = 2.5
# Four plots 2x2 (e.g. umaps)
fig_patchwork4_height = fig_standard2_height * 2
# Six plots 2x3 (e.g. umaps)
fig_patchwork6_height = fig_standard3_height * 2
# Eight plots 4x2 (e.g. umaps)
fig_patchwork8_height = fig_standard4_height * 2
# Twelve plots 4x3 (e.g. umaps)
fig_patchwork12_height = fig_standard4_height * 3
# Sixteen plots 4x4 (e.g. umaps)
fig_patchwork16_height = fig_standard4_height * 4# Load renv and virtualenvs
# Activate the project-local renv library and its reticulate virtualenv
renv::load(file.path(param$path_to_git,"env/basic"))
renv::use_python(type = "virtualenv", name = file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
#reticulate::use_virtualenv(file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(knitr) # report rendering
library(clustree) # clustering-resolution trees
library(ggraph)Gene annotation including Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names, are loaded from a pre-prepared reference file or Ensembl.
## Read gene annotation
# We read gene annotation from file.
# We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names.
### Set reference
################################################################################
# Species-specific defaults: biomaRt dataset, mitochondrial gene name pattern and
# enrichR databases. For any other species, param$mart_dataset must be preset by
# the caller (the previous code carried no-op self-assignment branches here).
if (param$species=="human") {
  param$mart_dataset="hsapiens_gene_ensembl"
  param$mt = "^MT-"
  param$enrichr_dbs = c("GO_Biological_Process_2018", "WikiPathways_2019_Human", "KEGG_2021_Human", "Azimuth_Cell_Types_2021")
} else if (param$species=="mouse") {
  param$mart_dataset="mmusculus_gene_ensembl"
  param$mt = "^mt-"
  param$enrichr_dbs = c("GO_Biological_Process_2018", "WikiPathways_2019_Mouse", "KEGG_2019_Mouse", "Azimuth_Cell_Types_2021")
}
# Default Ensembl annotation version if none was specified
if (is.null(param$annot_version)) param$annot_version=98
# Derived paths: reference directory, annotation table and cell cycle marker file
param$path_reference=file.path(param$path_to_git, "references", param$mart_dataset, param$annot_version)
param$reference=paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt")
param$file_annot = file.path(param$path_reference, param$reference)
param$file_cc_genes = file.path(param$path_reference, "cell_cycle_markers.xlsx")
### Download reference if not existing
################################################################################
# If either the annotation table or the cell cycle marker file is missing, run
# the download module (expected to write both files under param$path_reference)
if (!file.exists(param$file_annot) | !file.exists(param$file_cc_genes)) {
source(file.path(param$path_to_git, "modules/download_references/download_references.R"))
}
### read_ensembl_annotation
################################################################################
# Load from file
annot_ensembl = read.delim(param$file_annot)
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Translate IDs
# TranslateIDs is a sourced helper; element order of the returned list is
# assumed as indexed below — TODO confirm against functions_biomart.R
IDs_out = suppressWarnings(TranslateIDs(annot_ensembl, param$annot_main))
ensembl_to_seurat_rowname = IDs_out[[1]]
seurat_rowname_to_ensembl = IDs_out[[2]]
seurat_rowname_to_entrez = IDs_out[[3]]
annot_ensembl = IDs_out[[4]]
### read_cc_genes
################################################################################
# Use biomart to translate human cell cycle genes to the species of interest
# Load from file
# Sheet 1 = S-phase markers, sheet 2 = G2/M-phase markers — assumed from the
# variable names below; TODO confirm against the workbook layout
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)
# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))The workflow is appicable to single cell and nuclei RNAseq data pre-processed via 10x Genomics or SmartSeq-2 or for other data that are represented by a simple table with transcript counts per gene and cell. In the first step, a Seurat object of the data is generated and subsequently some metadata are added. Similarly, a Seurat object can be loaded to inspect the stored scRNA seq data.
Here, for the project Testdata, the following data are analysed:
### Read rds object
################################################################################
### Load Seurat S4 objects
# Test if file is defined
if (is.null(param$data)) {
  message("Dataset is not specified")
} else {
  # Test if file exists
  if (file.exists(param$data)) {
    # Read object
    message(paste0("Load dataset: ", param$data))
    sc = base::readRDS(param$data)
    # Transfer original params to loaded object.
    # Parameters stored inside the object take precedence, except for the
    # run-specific entries in param_keep and the advanced settings param_advset.
    if ("parameters" %in% list_names(sc@misc[])) {
      orig_param = sc@misc$parameters
      param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
      param = modifyList(x = param, val = orig_param)
      param = modifyList(x = param, val = param_keep)
      param = modifyList(x = param, val = param_advset)
    }
    ### Set colors
    # Set sample colors based on orig.ident; regenerate when the stored palette
    # does not match the number of samples
    if (!is.null(sc@meta.data[["orig.ident"]])) {
      if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
        message(paste0("No or wrong number of distinct colors for samples provided. Generate colors using color palette ", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples))
        sc = sample_colours_out[[1]]
        param$col_samples = sample_colours_out[[2]]
      }
    }
    # Set colors for clusters based on seurat_clusters
    if (!is.null(sc@meta.data[["seurat_clusters"]])) {
      if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
        message(paste0("No or wrong number of distinct colors for clusters provided. Generate colors using color palette ", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters))
        sc = cluster_colours_out[[1]]
        param$col_clusters = cluster_colours_out[[2]]
      }
    }
    # Set colors for cell types based on annotation
    if (!is.null(sc@meta.data[["annotation"]])) {
      if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
        message(paste0("No or wrong number of distinct colors for cell type annotation provided. Generate colors using color palette ", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation))
        sc = annotation_colours_out[[1]]
        param$col_annotation = annotation_colours_out[[2]]
      }
    }
    sc
  } else {
    message("Dataset does not exist")
  }
}
### Load reference Seurat S4 objects if specified
# Test if file is defined
if (!is.null(param$refdata)) {
  # Test if file exists
  if (file.exists(param$refdata)) {
    # Read object
    message(paste0("Load dataset: ", param$refdata))
    scR = base::readRDS(param$refdata)
    # Transfer original params to loaded object.
    # Only the colour palettes are copied over (into *_ref entries), then the
    # advanced settings param_advset are re-applied on top.
    if ("parameters" %in% list_names(scR@misc[])) {
      orig_paramR = scR@misc$parameters
      if (!is.null(orig_paramR$col_samples)) {
        param$col_samples_ref = orig_paramR$col_samples
      }
      if (!is.null(orig_paramR$col_clusters)) {
        param$col_clusters_ref = orig_paramR$col_clusters
      }
      if (!is.null(orig_paramR$col_annotation)) {
        param$col_annotation_ref = orig_paramR$col_annotation
      }
      param = modifyList(x = param, val = param_advset)
    }
    ### Set colors
    # Set sample colors based on orig.ident; regenerate when the stored palette
    # does not match the number of samples
    if (!is.null(scR@meta.data[["orig.ident"]])) {
      if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
        message(paste0("No or wrong number of distinct colors for samples provided. Generate colors using color palette ", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples))
        scR = sample_colours_out[[1]]
        param$col_samples_ref = sample_colours_out[[2]]
      }
    }
    # Set colors for clusters based on seurat_clusters
    if (!is.null(scR@meta.data[["seurat_clusters"]])) {
      if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
        message(paste0("No or wrong number of distinct colors for clusters provided. Generate colors using color palette ", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters))
        scR = cluster_colours_out[[1]]
        param$col_clusters_ref = cluster_colours_out[[2]]
      }
    }
    # Set colors for cell types based on annotation
    if (!is.null(scR@meta.data[["annotation"]])) {
      if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
        message(paste0("No or wrong number of distinct colors for cell type annotation provided. Generate colors using color palette ", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation))
        scR = annotation_colours_out[[1]]
        param$col_annotation_ref = annotation_colours_out[[2]]
      }
    }
    scR
  } else {
    message("Reference dataset does not exist")
  }
} # Transfer object into a list
# Downstream code expects 'sc' to be a named list of Seurat objects keyed by project id
sc_original = sc
sc = list()
n = param$project_id
sc[[n]] = sc_original# Download test dataset
# Build the path to the requested test-dataset download script and run it if present
param$path_test_dataset=paste0(param$path_to_git, "/modules/download_test_datasets/", param$download_test_datasets, ".R")
if (file.exists(param$path_test_dataset)) {
message(paste0("Using test dataset '", gsub('download_','', param$download_test_datasets), "'."))
# Data output in a data subfolder of the directory where it is run
# Create output directories
# NOTE(review): dir.create("data") is relative to the current working directory,
# while setwd() below moves into file.path(param$path_to_git, "data") — these may
# be different directories; confirm the intended target
if (!file.exists("data")) dir.create("data", recursive=TRUE, showWarnings=FALSE)
setwd(file.path(param$path_to_git,"data"))
source(param$path_test_dataset)
# Register the downloaded 10x count folders as input datasets (name/type/path table)
param$path_data = data.frame(name=list.dirs(path = file.path(param$path_to_git,"data", "counts"),
full.names = FALSE, recursive = FALSE),
type=c("10x"),
path=list.dirs(path = file.path(param$path_to_git,"data", "counts"),
full.names = TRUE, recursive = FALSE))
} else {
message("Test dataset does not exist.")
}# List of Seurat objects
sc = list()
datasets = param$path_data
# Read each dataset row into one (or more) Seurat objects.
# seq_len() is safe when 'datasets' has zero rows (seq(nrow(...)) would yield c(1, 0)).
for (i in seq_len(nrow(datasets))) {
  name = datasets[i, "name"]
  type = datasets[i, "type"]
  path = datasets[i, "path"]
  suffix = datasets[i, "suffix"]
  # Read 10X or smartseq2
  if (type == "10x") {
    # Read 10X sparse matrix into a Seurat object
    sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
  } else if (type == "smartseq2") {
    # Read counts table into a Seurat object
    sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
  }
}
# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
sample_names[i] = paste(head(sc[[i]][["orig.dataset", drop=TRUE]], 1), sample_names[i], sep=".")
sc[[i]][["orig.ident"]] = sample_names[i]
}
# Set up colors for samples and add them to the sc objects
sample_names = purrr::flatten_chr(purrr::map(sc, function(s) {
nms = unique(as.character(s[[]][["orig.ident"]]))
return(nms)
}))
# GenerateColours and ScAddLists are sourced helpers; colours are attached to
# each object under the 'colour_lists' slot name — TODO confirm storage location
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")
message("Read scRNA-seq data into Seurat object")
sc## $sample1
## An object of class Seurat
## 8000 features across 500 samples within 1 assay
## Active assay: RNA (8000 features, 0 variable features)
## 1 layer present: counts
##
## $sample2
## An object of class Seurat
## 8000 features across 700 samples within 1 assay
## Active assay: RNA (8000 features, 0 variable features)
## 1 layer present: counts
### Calculate percentages of mitochondrial and ribosomal genes as well as ERCC (qc_calculate_cells)
# Calculate percentage of counts in mitochondrial genes for each Seurat object
# Skip the calculation if the column already exists (e.g. pre-processed objects)
sc = purrr::map(sc, function(s) {
if (!("percent_mt" %in% colnames(s@meta.data))) {
mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
return(Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw))
} else {
return(s)
}
})
# Calculate percentage of counts in ribosomal genes for each Seurat object
# Ribosomal genes matched by the RPS/RPL prefix, case-insensitive
sc = purrr::map(sc, function(s) {
if (!("percent_ribo" %in% colnames(s@meta.data))) {
ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
return(Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw))
} else {
return(s)
}
})
# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
return(s)
})
# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())
# Names of all available QC metrics
cell_qc_features = c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)
### Expand filter thresholds to all samples (qc_criteria_create)
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$cell_filter)) {
    return(param$cell_filter[[s]])
  } else {
    return(param$cell_filter)
  }
})
param$feature_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$feature_filter)) {
    return(param$feature_filter[[s]])
  } else {
    return(param$feature_filter)
  }
})
# List of all filter thresholds per QC metric: for each metric, collect the
# per-sample min/max thresholds into a long-format table (orig.ident, threshold, value)
cell_qc_thresholds = purrr::map(cell_qc_features, function(m) {
  purrr::map_dfr(names(param$cell_filter), function(n) {
    # Samples without a filter for this metric contribute no rows
    if (!m %in% names(param$cell_filter[[n]])) return(NULL)
    t = data.frame(orig.ident=n, min=param$cell_filter[[n]][[m]][1], max=param$cell_filter[[n]][[m]][2]) %>%
      tidyr::pivot_longer(cols=2:3, names_to=c("threshold")) %>%
      dplyr::filter(!is.na(value))
    t$threshold = factor(t$threshold, levels=c("min", "max"))
    return(t)
  })
})
### Calculate diverse feature characteristics (qc_calculate_features)
# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
# This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
# Calculate percentage of counts per gene in a cell
counts_rna = Seurat::GetAssayData(sc[[n]], layer="counts", assay=param$assay_raw)
total_counts = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
# Calculate feature filters: number of cells expressing each gene at >=1 count,
# and at the per-sample configured minimum count
num_cells_expr = Matrix::rowSums(counts_rna >= 1)
num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
# Calculate median of counts_rna_perc per gene
# NOTE: apply() densifies the sparse matrix; may be memory-heavy on large data
counts_median = apply(counts_rna_perc, 1, median)
# Add all QC measures as metadata
sc[[n]][[param$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
return(sc[[n]])
})Quality control (QC) is an important step of the pre-processing
workflow. We start the analysis by removing unwanted cells from the
dataset(s).
These commonly used QC metrics, namely the number of unique genes
detected in each cell (“nFeature_”), the total number of molecules
detected in each cell (“nCount_”), and the percentage of counts that map
to the mitochondrial genome (“percent_mt”), can be used to infer filter
thresholds. If ERCC spike-in controls were used, the percentage of
counts mapping to them is also shown (“percent_ercc”).
In cell QC these covariates are filtered via thresholding as they are a good indicator of cell viability. However, it is crucial to consider the three QC covariates jointly as otherwise it might lead to misinterpretation of cellular signals.
r knitcitations::citet("https://doi.org/10.1093/gigascience/giaa151")
as ambient RNA can confound the number of observed counts and adds
background noise to the data. Cells and genes are filtered based on the following thresholds:
# Flatten the per-sample filter list; names become "<sample>.<metric>"
cell_filter_lst = param$cell_filter %>% unlist(recursive=FALSE)
# A numeric cell filter is a length-2 c(min, max) vector; anything else is categorical.
# Scalar '&&' is appropriate here since both operands are length-1 per filter.
is_numeric_filter = purrr::map_lgl(cell_filter_lst, function(f) return(is.numeric(f) && length(f)==2))
# numeric cell filters
if (length(cell_filter_lst[is_numeric_filter]) > 0) {
  purrr::exec("rbind", !!!cell_filter_lst[is_numeric_filter]) %>%
    knitr::kable(align="l", caption="Numeric filters applied to cells", col.names=c("Min", "Max")) %>%
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
| Min | Max | |
|---|---|---|
| sample1.nFeature_RNA | 500 | 2400 |
| sample1.percent_mt | NA | 20 |
| sample2.nFeature_RNA | 500 | 2400 |
| sample2.percent_mt | NA | 20 |
# categorical cell filters: shown as comma-joined allowed values per filter
if (length(cell_filter_lst[!is_numeric_filter]) > 0) {
  purrr::exec("rbind", !!!cell_filter_lst[!is_numeric_filter] %>% purrr::map(paste, collapse=",")) %>%
    knitr::kable(align="l", caption="Categorical filters applied to cells", col.names=c("Values")) %>%
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
| Values | |
|---|---|
| sample1.nCount_RNA | NA,NA |
| sample2.nCount_RNA | NA,NA |
# gene filters
# Display the per-sample gene filter thresholds as a table
feature_filter_lst = param$feature_filter %>% unlist(recursive=FALSE)
if (length(feature_filter_lst) > 0) {
purrr::exec("rbind", !!!feature_filter_lst) %>%
knitr::kable(align="l", caption="Filters applied to genes", col.names=c("Value")) %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}| Value | |
|---|---|
| sample1.min_counts | 1 |
| sample1.min_cells | 5 |
| sample2.min_counts | 1 |
| sample2.min_cells | 5 |
The number of excluded cells and features is as follows:
# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
# Result: per sample, a named list of character vectors of cell barcodes to exclude
sc_cells_to_exclude = purrr::map(list_names(sc), function(n) {
filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
filter = param$cell_filter[[n]][[f]]
if (is.numeric(filter)) {
# Numeric filter: c(min, max); NA means unbounded on that side
if (is.na(filter[1])) filter[1] = -Inf # Minimum
if (is.na(filter[2])) filter[2] = Inf # Maximum
idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
return(names(which(idx_exclude)))
} else if (is.character(filter)) {
# Categorical filter: exclude cells whose value is not in the allowed set
idx_exclude = !sc[[n]][[f, drop=TRUE]] %in% filter
return(Cells(sc[[n]])[idx_exclude])
}
})
# Samples to drop
if (n %in% param$samples_to_drop) {
filter_result[["samples_to_drop"]] = colnames(sc[[n]])
} else {
filter_result[["samples_to_drop"]] = as.character(c())
}
# Minimum number of cells for a sample to keep
if (ncol(sc[[n]]) < param$samples_min_cells) {
filter_result[["samples_min_cells"]] = colnames(sc[[n]])
} else {
filter_result[["samples_min_cells"]] = as.character(c())
}
return(filter_result)
})
# Summarise
# Per-sample counts of excluded cells per filter, plus totals and percentage kept
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
return(as.data.frame(purrr::map(s, length)))
})
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
sc_cells_to_exclude_summary$Original = purrr::map_int(sc, ncol)
# A cell can fail several filters; count each excluded cell only once
sc_cells_to_exclude_summary$Excluded = purrr::map_int(sc_cells_to_exclude, function(s) { return(purrr::flatten(s) %>% unique() %>% length())})
sc_cells_to_exclude_summary$PercKept = round((sc_cells_to_exclude_summary$Original - sc_cells_to_exclude_summary$Excluded) / sc_cells_to_exclude_summary$Original * 100, 2)
knitr::kable(sc_cells_to_exclude_summary,
align="l",
caption="Summary of excluded cells") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| nFeature_RNA | nCount_RNA | percent_mt | samples_to_drop | samples_min_cells | Original | Excluded | PercKept | |
|---|---|---|---|---|---|---|---|---|
| sample1 | 54 | 0 | 46 | 0 | 0 | 500 | 66 | 86.80 |
| sample2 | 73 | 0 | 58 | 0 | 0 | 700 | 79 | 88.71 |
# Add filter column to sc_cell_metadata for post-filtering QC
sc_cell_metadata$IS_FILTERED = rownames(sc_cell_metadata) %in% unlist(sc_cells_to_exclude)
# Now filter, drop the respective colours and adjust integration method
# Samples where every cell was excluded are removed from the list entirely
sc = purrr::map(list_names(sc), function(n) {
cells_to_keep = Cells(sc[[n]])
cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
if (length(cells_to_keep)==0) return(NULL)
else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)
if (length(sc)==1) param$integrate_samples[["method"]] = "single"# Only RNA assay at the moment
# Iterate over samples and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
# Make sure the sample contains more cells than the minimum threshold
if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
# Return gene names that do not pass the minimum threshold
else return(names(which(sc[[n]][[param$assay_raw]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})
# Which genes are to be filtered for all samples?
# Note: Make sure that no other sample is called "AllSamples"
sc_features_to_exclude$AllSamples = Reduce(f=intersect, x=sc_features_to_exclude)
# Summarise
# Per-sample table: original gene count, genes failing the threshold, and genes
# kept after removing the genes excluded in all samples
sc_features_to_exclude_summary = purrr::map_dfr(names(sc), function(n){
df = data.frame(Original=nrow(sc[[n]]),
FailThreshold=length(sc_features_to_exclude[[n]]))
df$PercFailThreshold = round(df$FailThreshold / df$Original * 100, 2)
df$Kept = length(setdiff(rownames(sc[[n]]), sc_features_to_exclude[["AllSamples"]]))
df$PercKept = round(df$Kept / df$Original * 100, 2)
return(df)
})
rownames(sc_features_to_exclude_summary) = names(sc)
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Summary of excluded genes") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))| Original | FailThreshold | PercFailThreshold | Kept | PercKept | |
|---|---|---|---|---|---|
| sample1 | 8000 | 1915 | 23.94 | 6439 | 80.49 |
| sample2 | 8000 | 1686 | 21.08 | 6439 | 80.49 |
# Now filter those genes that are to be filtered for all samples
sc = purrr::map(list_names(sc), function(n) {
assay_names = Seurat::Assays(sc[[n]])
# Keep, per assay, only features that were not excluded across all samples
features_to_keep = purrr::map(values_to_names(assay_names), function(a) {
features = rownames(sc[[n]][[a]])
keep = features[!features %in% sc_features_to_exclude$AllSamples]
return(keep)
})
return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})After filtering, the size of the Seurat object is:
sc## $sample1
## An object of class Seurat
## 6439 features across 434 samples within 1 assay
## Active assay: RNA (6439 features, 0 variable features)
## 1 layer present: counts
##
## $sample2
## An object of class Seurat
## 6439 features across 621 samples within 1 assay
## Active assay: RNA (6439 features, 0 variable features)
## 1 layer present: counts
# Create plot per QC metric
# One violin plot per QC metric (kept cells only), coloured by sample, with
# the per-sample filter thresholds overlaid as horizontal segments
p_list = list()
for (m in cell_qc_features) {
p_list[[m]]= ggplot(sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED), aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]], group=.data[["orig.ident"]])) +
geom_violin(scale="width")
# Adds points for samples with less than three cells since geom_violin does not work here
p_list[[m]] = p_list[[m]] +
geom_point(data=sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED, orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]]), shape=21, size=2)
# Now add style
p_list[[m]] = p_list[[m]] +
AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Add filter threshold as segments to plot; min threshold lines are dashed and max threshold lines are twodashed
if (nrow(cell_qc_thresholds[[m]]) > 0) {
sample_names = sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED) %>% dplyr::pull(orig.ident) %>% unique()
p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]] %>% dplyr::filter(orig.ident %in% sample_names),
aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value, lty=threshold), colour="firebrick") +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values")
p# Correlate QC metrics for cells
# Scatter plots correlating pairs of QC metrics for cells.
# Helper: scatter of two QC metrics m = c(x_metric, y_metric), coloured by sample,
# filtered cells drawn transparent, with filter thresholds overlaid as red lines.
# (Replaces three near-identical copies of this plotting code.)
plot_qc_metric_pair = function(m) {
  p = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["orig.ident"]], alpha=.data[["IS_FILTERED"]])) +
    geom_point(size = param$pt_size) +
    scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
    scale_alpha_manual(values=setNames(c(1, 0.1), c(FALSE, TRUE))) +
    AddStyle(col=param$col_samples)
  # Vertical/horizontal threshold lines for the x/y metric, if any are defined
  if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
    p = p + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
  }
  if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
    p = p + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
  }
  return(p)
}
p_list = list()
# Random plotting order so that no sample is systematically drawn on top
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))
# nFeature vs nCount
p_list[[1]] = plot_qc_metric_pair(paste0(c("nCount_", "nFeature_"), param$assay_raw))
# nFeature vs percent_mt
p_list[[2]] = plot_qc_metric_pair(c("percent_mt", paste0("nFeature_", param$assay_raw)))
# nFeature vs percent_ercc (if available)
if ("percent_ercc" %in% names(cell_qc_features)) {
  p_list[[3]] = plot_qc_metric_pair(c("percent_ercc", paste0("nFeature_", param$assay_raw)))
}
# Combine plots
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("Features plotted against each other")
if (length(p_list) == 1) {
  p = p & theme(legend.position="bottom")
} else {
  p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
p
# downsample_cells_n overwrites downsample_cells_equally
if (!is.null(param$downsample_cells_n)) {
  n = param$downsample_cells_n
} else if (param$downsample_cells_equally) {
  # Downsample every sample to the size of the smallest one
  n = purrr::map_int(sc, ncol) %>% min()
}
# Actual downsampling ('||' since both conditions are scalar)
if (!is.null(param$downsample_cells_n) || param$downsample_cells_equally) {
  sc = purrr::map(sc, function(s) {
    cells = ScSampleCells(sc=s, n=n, seed=1)
    return(subset(s, cells=cells))
  })
  # Adjust combined metadata accordingly
  sc_cell_metadata = sc_cell_metadata[unlist(purrr::map(sc, Cells)), ]
  # Report the actual target size 'n': param$downsample_cells_n is NULL when
  # downsampling equally and would print an empty value in the message
  message(paste0("Your data has been down-sampled to ", n, " cells." ))
  print(sc)
}
# Remove filtered cells from metadata
# Drop cells flagged as filtered from the combined metadata
sc_cell_metadata = sc_cell_metadata %>% dplyr::filter(IS_FILTERED==FALSE)
# Update levels but make sure level order stays the same:
# drop factor levels that no longer occur after filtering, preserving original order.
# which(map_lgl(...)) is type-stable, unlike sapply; loop variable renamed so it
# no longer shadows the `n` used by the downsampling step above.
idx.factors = which(purrr::map_lgl(sc_cell_metadata, is.factor))
for (col_nm in colnames(sc_cell_metadata)[idx.factors]) {
  levels_old = levels(sc_cell_metadata[[col_nm]])
  levels_new = unique(as.character(sc_cell_metadata[[col_nm]]))
  sc_cell_metadata[[col_nm]] = factor(sc_cell_metadata[[col_nm]], levels=levels_old[levels_old %in% levels_new])
}
# Update actual colors as well, as they will appear in the plots otherwise
param$col_samples = param$col_samples[names(param$col_samples) %in% names(sc)]
In the following sections, we subsequently run a series of Seurat functions for each provided sample. We normalize, select variable genes, and combine the datasets according to the chosen methods. In the process cell cycle scores are assigned to each cell based on its normalized expression of G2/M and S phase markers and data are scaled while regressing out covariates if desired.
Depending on the normalisation of your choice, we either:
The number of raw sequencing reads per cell is influenced by technical differences in the capture, reverse transcription and sequencing of RNA molecules, particularly due to the difficulty of achieving consistent library preparation with minimal starting material. Thus, comparing gene expression between cells may reveal differences that are solely due to sampling effects. After low-quality cells were removed by filtering, the primary goal of normalization is to remove technical sampling effects while preserving the true biological signal. Hence, normalization of the raw counts accounts for differences in sequencing depth per cell.
The standard log normalisation, a count depth scaling, is the simplest and most commonly used normalization strategy. The underlying idea is that each cell initially contained an equal number of mRNA molecules, and differences arise due to sampling effects. For each cell, the number of reads per gene is divided by a cell-specific “size factor”, which is proportional to the total count depth of the cell. The resulting normalized data add up to 1 per cell, and is then typically multiplied by a factor of 10 (10,000 in this workflow). Finally, normalized data are log-transformed for three important reasons. First, distances between log-transformed expression data represent log fold changes. Log-transformation emphasizes contributions from genes with strong relative differences, for example a gene that is expressed at an average count of 50 in cell type A and 10 in cell type B rather than a gene that is expressed at an average count of 1100 in A and 1000 in B. Second, log-transformation mitigates the relation between mean and variance in the data. Lastly, log-transformation reduces the skewness of the data as many downstream analyses require the data to be normally distributed (Luecken and Theis 2019).
SCTransform is a new statistical approach for the
modelling, normalization and variance stabilization of single-cell
RNA-seq data. SCTransform models the UMI counts (via a
regularized negative binomial model) to remove the variation due to
sequencing depth and adjusts the variance based on pooling information
across genes with similar abundances and automatically regresses out
sequencing depth (nCounts). Hence, SCTransform can be
applied to 10x (contains UMI counts) but not SmartSeq-2 data. Additional
unwanted sources of variations can be regressed out during
SCTransform.
While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.
# Normalize data the original way
# This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
# Run the normalization module only when the "data" layer is not yet present in the
# RNA assay (e.g. when the object was freshly created rather than restored from rds)
if (!("data" %in% sc[[param$project_id]][["RNA"]][])) {
source(file.path(param$path_to_git,'modules/pre-processing/normalization.R'))
}message(paste0("Here, the ", ifelse(param$norm=="RNA","Standard log normalization","SCTransform")," method was used and no additional sources of variance regressed out."))×
(Message)
Here, the Standard log normalization
method was used and no additional sources of variance regressed out.
r Cite("10.1186/s13059-019-1874-1")). This method first
models the technical variability as a relationship between mean gene
expression and variance using local polynomial regression. The model is
then used to calculate the expected variance based on the observed mean
gene expression. The difference between the observed and expected
variance is called residual variance and likely reflects biological
variability.
Whether samples should be combined via merging or integrating is very context dependent. It is advisable to first perform a dimensionality reduction on the merged dataset and only proceed to integration if common cell types separate due to batch effects.
Merging LogNormalized data (1a)
LogNormalized data are merged based according to https://satijalab.org/seurat/archive/v4.3/merge. Merging
will combine the Seurat objects based on the raw count matrices and also
merge the normalized data matrices. Any previously scaled data matrices
are erased. As the results of this layer depend on the expression
across all cells in an object, the previous values from one object
before merging with another are no longer valid. Therefore, the layer
is removed and the scaling needs to be re-run after merging objects.
Merging SCTransform data (2a, 2c)
The goal of SCTransform is to perform normalization within an experiment
by learning the model of technical noise (within the experiment).
Running SCTransform standardizes the expression of each gene in each
cell relative to all the other cells in the input matrix.
2a) If you have multiple samples that are technically
noisy and you want to remove batch effects that are characterized by
simple shifts in mean expression, it is recommend to run SCTransform on
each sample individually. However, the samples should have roughly the
same celltype compositions. The merged object has then multiple SCT
models and the genes in the scale.data are the intersected genes.
2c) However, scale.data calculated based on multiple
different SCTransform models are not comparable. Performing SCTransform
separately on very diverging samples results in loss of the baseline
differences between them (similar to a gene-wise scaling before
merging). Hence, if samples are biologically heterogeneous or under
different treatments and, therefore, the SCTransform models of each
separate sample very diverging, SCTranform should be (re-)run on merged
samples. This way, SCTransform will learn one model using the median
sequencing depth of the entire dataset to calculate the corrected
counts.
Integration (1b, 2b, 2d)
As integration uses the shared highly variable genes, prior to
integration LogNormalize and identification of variable genes or
SCTransform is required. Depending on the integration method,
dimensional reduction of the data is needed additionally (e.g. for
reciprocal PCA).
This workflow implements the following integration methods offered by
Seurat:
- Anchor-based CCA integration (method=CCAIntegration)
- Anchor-based RPCA integration (method=RPCAIntegration)
Anchor-based integration can be applied to either log-normalized or SCTransform-normalized datasets and can be also performed as reference-based integration where one or a subset of the datasets are listed as a ‘reference’. Reference-based integration, reduces computational time.
CCA-based integration enables identification of conserved cell types across datasets and integrative analysis when gene expression is shifted e.g. due to experimental conditions, disease states, or even species affiliation. If cell types are present in only one dataset, the cells will appear as a separate sample-specific cluster.
Integration comprises the following steps:if (length(sc) == 1) {
# A single sample requires no merging/integration: unwrap it from the list
# Default assay is set automatically
sc = sc[[1]]
message("Your dataset contains 1 sample only. No merging/integrating required.")
} else {
# Multiple samples: combine them (merge or integrate) via the combine_samples module
source(file.path(param$path_to_git,'modules/pre-processing/combine_samples.R'))
}×
(Message)
Data values for all samples have been
merged. This means that data values have been concatenated, not
integrated.
## An object of class Seurat
## 6439 features across 1055 samples within 1 assay
## Active assay: RNA (6439 features, 3000 variable features)
## 3 layers present: data, counts, scale.data
## 1 dimensional reduction calculated: pca
# Only relevant if data loaded from rds, otherwise they would be scaled and dimensional reduced before
# Scale all genes only when no "scale.data" layer exists yet in the RNA assay
if (!("scale.data" %in% sc[["RNA"]][])) {
# Scale (default)
all_genes = rownames(sc)
sc = suppressMessages(ScaleData(sc, features = all_genes))
}
# Run pca with the variable genes
# Skipped when a "pca" reduction is already present; npcs is capped at the cell count
if (!("pca" %in% list_names(sc@reductions[]))) {
# Run PCA for default normalization
sc = suppressMessages(Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc))))
}Next, we estimate the impact of covariates, such as the percentage of mitochondrial gene expression and cell cycle phase, to evaluate reliability of the applied normalization and scaling method, and determine possibly variables to regress out and integration method.
There can be sources of uninteresting variation in the data that is often specific to the dataset. Hence, checking and removing unwanted variation is another important step in pre-processing so that those artifacts do not drive clustering in the downstream analysis.
Variables provided in vars.to.regress are individually regressed against each feature, and the resulting residuals are then scaled and centered. This step allows controlling for factors that may bias clustering.
Count depth effect
Although normalization scales count data to render gene counts
comparable between cells, a count depth effect often remains in the data
as no scaling method can infer the expression values of genes that were
not detected. This count depth effect can be both a biological and a
technical artefact (due to poor sampling).
SCTransform regresses out variation due to the number
of UMIs by default.
Cell cycle score
Cell cycle phase may be a source of significant variation in the data.
It is essential to check, how much the gene expression profiles in the
data reflect the cell cycle phases the single cells were in.
Note that removing all signal associated with the cell cycle can negatively impact downstream analysis. Cell cycle signals can be informative of the biology. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. Moreover, biological signals must be understood in context. Dependencies exist between different cellular processes within the organism. Thus, correcting for one process may unintentionally mask the signal of another. Vice versa, due to a correlation of cell size and some transcriptomic effect attributed to the cell cycle (McDavid et al, 2016), correcting for cell size via normalization also partially corrects for cell cycle effects in scRNA‐seq data.
An alternative approach is to remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences in cell cycle phase amongst proliferating cells (which are often uninteresting), will be removed. For a more detailed explanation, see the cell cycle vignette for Seuratr Cite("https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html").
r Cite("10.1126/science.aad0501"), and human gene symbols
are translated to gene symbols of the species of interest using
biomaRt.message(
# Summarise for the report which covariates were regressed out and how cell cycle
# effects were handled, based on the vars_to_regress / cc_remove* parameters
paste0("Here, ",
ifelse(is.null(param$vars_to_regress),"no additional sources of variance regressed out.",
paste0(param$vars_to_regress, " as sources of variance was/were regressed out. ")),
ifelse(param$cc_remove==FALSE,"No cell cycle effects removed for this report.",
paste0(
ifelse(param$cc_remove_all==TRUE,"All cell cycle effects removed for this report.",
"Difference in cell cycle state removed for this report.")
))
)
)×
(Message)
Here, no additional sources of variance
regressed out.No cell cycle effects removed for this report.
A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes, since expression profiles of different genes are correlated if they are involved in the same biological process. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarize a dataset, and to visualize a dataset.
We use Principal Component Analysis (PCA) to summarize a dataset, overcoming noise and reducing the data to its essential components. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualize the dataset, placing similar cells together in 2D space, see below.
Principal Component Analysis is a way to summarize a dataset and to reduce noise by averaging similar gene expression profiles. The information for correlated genes is compressed into single dimensions called principal components (PCs) and the analysis identifies those dimensions that capture the largest amount of variation. This process gives a more precise representation of the patterns inherent to complex and large datasets.
In a PCA, the first PC captures the greatest variance across the whole dataset. The next PC captures the greatest remaining amount of variance, and so on. This way, the top PCs are likely to represent the biological signal where multiple genes are affected by the same biological processes in a coordinated way. In contrast, random technical or biological noise that affects each gene independently are contained in later PCs. Downstream analyses can be restricted to the top PCs to focus on the most relevant biological signal and to reduce noise and unnecessary computational overhead.Running a PCA on our object, we see how the variance can be explained.
# Show the genes with the largest loadings for the first 12 principal components
p_list = Seurat::VizDimLoadings(sc, dims=1:12, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
# Label every panel with its PC number (seq_along is safe for empty lists)
for (i in seq_along(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(xlab = paste0("PC ", i))
# Fixed: the annotation previously said "first two PCs" although 12 PCs are plotted
p = patchwork::wrap_plots(p_list, ncol=4) + patchwork::plot_annotation("Top gene loadings of the first 12 PCs")
p
PCs include biological signal as well as noise, and we need to determine the number of PCs with which we include as much biological signal as possible and as little noise as possible. The following “Elbow plot” is designed to help us make an informed decision.
# More approximate technique used to reduce computation time
# Elbow plot of the variance explained per PC (at most 30 PCs shown)
p = Seurat::ElbowPlot(sc, ndims=min(30, ncol(sc))) +
# Mark the configured number of PCs
# NOTE(review): param$pc_n is only capped to ncol(sc) two lines below, after this
# vline is drawn — confirm the ordering is intended
geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) +
AddStyle(title="Elbow plot")
p# Cannot have more PCs than number of cells
param$pc_n = min(param$pc_n, ncol(sc))message(paste0("For the current dataset ", param$pc_n," PCs were chosen."))×
(Message)
For the current dataset 20 PCs were
chosen.
We visualize the sample distribution in low dimensional space to confirm that the chosen method for combining the samples was appropriate (see section Combine datasets). The following plots offer a low dimension representation of your data combined via dataset merging.
# Default UMAP
# Compute the 2D UMAP embedding from the first pc_n PCs using the uwot backend
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k))### Score plots
# PCA score plots colored by sample
p1 = Seurat::DimPlot(sc, reduction="pca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(1,2)) +
AddStyle(legend_position="none", title = "", xlab = "PC 1", ylab = "PC 2")
p2 = Seurat::DimPlot(sc, reduction="pca", group.by = "orig.ident", cols=param$col_samples, pt.size = 1, dims = c(3,4)) +
AddStyle(legend_position="none", title = "", xlab = "PC 3", ylab = "PC 4")
# Umap plot
p3 = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", cols = param$col_samples) +
AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom", xlab = "UMAP 1", ylab = "UMAP 2")
# Layout: large UMAP on the left, the two PCA score plots stacked on the right
p = p3 + (p1 / p2) +
plot_layout(widths = c(3, 1))
pThe clustering method used by Seurat first constructs a graph
structure, where nodes are cells and edges are drawn between cells with
similar gene expression patterns. Technically speaking, Seurat first
constructs a K-nearest neighbor (KNN) graph based on Euclidean distance
in PCA space, and refines edge weights between cells based on the shared
overlap in their local neighborhoods (Jaccard similarity). To partition
the graph into highly interconnected parts, cells are iteratively
grouped together using the Leiden algorithm
r Cite("10.1038/s41598-019-41695-z").
At this point, we would like to define subpopulations of cells with similar gene expression profiles using unsupervised clustering. Clusters ultimately serve as approximations for biological objects like cell types or cell states.
During the first step of clustering, a K-nearest neighbor (KNN) graph is constructed. In simplified terms this means that cells are connected to their K nearest neighbors based on cell-to-cell expression similarity using the PCs chosen in the previous step. The higher the similarity is, the higher the edge weight becomes. During the second step, the graph is partitioned into highly interconnected communities, whereby each community represents a cluster of cells with similar expression profiles. The separation of the graph into clusters is dependent on the chosen resolution. For scRNA-seq datasets of around 3000 cells, it is recommended to use a resolution value between 0.4 and 1.2. This value can be set even higher for larger datasets. Note that the choice of PCs and cluster resolution is an arbitrary one. Therefore, it is highly recommended to evaluate clusters and re-run the workflow with adapted parameters if needed.To get a first idea about how different cluster resolution values influence the clustering, we run and visualize the clustering multiple times.
# Resolutions used for the clustering tree plus the configured/test resolutions
tree_resolutions = seq(from = 0, to = 1.2, by = 0.2)
cluster_resolutions = sort(unique(c(param$cluster_resolution, param$cluster_resolution_test, tree_resolutions)))
# Construct phylogenetic tree relating the "average" cell from each sample
if (length(levels(sc$orig.ident)) > 1) {
sc = BuildClusterTree(sc, features=rownames(sc), verbose=FALSE)
Seurat::Misc(sc, "trees") = list(orig.ident = Seurat::Tool(sc, "BuildClusterTree"))
}
# The number of clusters can be optimized by tuning "resolution" -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
# Build the SNN graph from the first pc_n PCs with k=cluster_k neighbours
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE, k.param=param$cluster_k)
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
# But we can run multiple resolutions if requested
# algorithm=4 selects the Leiden algorithm (see FindClusters documentation)
sc = Seurat::FindClusters(sc, algorithm=4, method="igraph", resolution=cluster_resolutions, verbose=FALSE)
# Construct phylogenetic tree relating the "average" cell from each cluster for each clustering
# Also add colour lists for each clustering
for(r in cluster_resolutions) {
# Metadata column name used by Seurat for this resolution, e.g. "RNA_snn_res.0.6"
n = paste0(DefaultAssay(sc), "_snn_res.", r)
# Tree
if (length(levels(sc[[n, drop=TRUE]])) > 1) {
Seurat::Idents(sc) = n
sc = suppressWarnings(BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE))
l = list(Seurat::Tool(sc, "BuildClusterTree"))
names(l) = n
suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), l)})
}
# One colour per cluster level of this resolution, stored under the column name
col = GenerateColours(num_colours=length(levels(sc[[n, drop=TRUE]])), names=levels(sc[[n, drop=TRUE]]),
palette=param$col_palette_clusters, alphas=1)
# Colours
l = list(col)
names(l) = n
sc = ScAddLists(sc, lists=l, lists_slot="colour_lists")
}Clustering is used to group similar samples. One problem when conducting cluster analysis is deciding for a number of clusters, usually controlled by a parameter such as a numerical value for the resolution or a k integer for k-means clustering. A clustering tree visualises the relationships between cluster at a range of resolutions and can aid in the desicion.
A clustering tree shows how cells move as the clustering resolution is increased. Each level of the tree represents a different resolution. Each cluster forms a node in the tree. The edges are a representation of the number of cells from a lower resolution that contribute to a specific cluster at the next higher resolution. Thus, the clustering tree indicates how clusters are related to each other, which clusters are distinct and do not change with the increasing resolution, and which are unstable. E.g. we can see how one cluster is subdivided into others. Hence, the clustering tree can help to identify which clusters might be candidates for merging. On the other hand, nodes with multiple incoming edges would indicate that we might have over-clustered the data.
Here, we use the clustree package (Zappia and Oshlack 2018). While the nodes in the first graph are colored by the cluster resolution, the second plot is colored by the SC3 stability index, a measure of cluster stability across resolutions.
# Clustering trees across resolutions (sugiyama layout):
# left panel coloured by resolution, right panel by the SC3 stability index
p1 = clustree::clustree(sc, prefix = "RNA_snn_res.", layout = "sugiyama") +
scale_color_brewer(palette = "BrBG") +
scale_edge_color_continuous(low = "black", high = "gold")
p2 = clustree::clustree(sc, prefix = "RNA_snn_res.", node_colour = "sc3_stability", layout = "sugiyama")
p = p1 + p2
p
Overlay of clustering tree onto umap projection.
# Generate table with umap projection and resolutions
tbl_projection = as.data.frame(sc[["umap"]]@cell.embeddings[,1:2])
tbl_resolutions = as.data.frame(sc@meta.data %>% dplyr::select(contains("res.")))
tbl = cbind(tbl_projection, tbl_resolutions)
# Plot overlay of cluster tree with umap
overlay_list = clustree::clustree_overlay(tbl, prefix = "RNA_snn_res.", x_value = "umap_1", y_value = "umap_2", plot_sides = TRUE, use_colour = "points", alt_colour = "black")
# NOTE(review): the colour list is hard-coded to resolution 1.2 — confirm this
# matches the highest resolution actually present in the clustering tree
tree_color = sc@misc$colour_lists$RNA_snn_res.1.2
p1 = overlay_list$overlay +
scale_color_manual(values = tree_color) +
scale_fill_brewer(palette = "BrBG", direction = -1) +
AddStyle(legend_position="right", xlab = "UMAP 1", ylab = "UMAP 2") +
patchwork::plot_annotation(title="Top view")
# NOTE(review): the next line fuses two statements from the rendered report
# ("p1" printing the plot, then "p2 = ...") — restore the line break in the source
p1p2 = overlay_list$x_side +
scale_color_manual(values = tree_color) +
scale_fill_brewer(palette = "BrBG", direction = -1) +
AddStyle(legend_position="none", xlab = "UMAP 1")
p3 = overlay_list$y_side +
scale_color_manual(values = tree_color) +
scale_fill_brewer(palette = "BrBG", direction = -1) +
AddStyle(legend_position="none", xlab = "UMAP 2")
# Side views of the overlay
p = p2 + p3 +
patchwork::plot_annotation(title="Side views")
pWe use a UMAP to visualise and explore the dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see
r Cite("https://pair-code.github.io/understanding-umap/").
Here we plot the UMAP projection colored by clusters obtained using different resolutions.
# Define height of test clusters
# Figure height grows with the number of resolution panels (3 panels per row)
height_per_row = 3
nr_cols = 3
nr_rows = ceiling(length(cluster_resolutions)/nr_cols)
# NOTE(review): the next line fuses two statements from the rendered report
# ("fig_height_test_clusters = nr_rows * height_per_row" and
# "tree_resolutions = tree_resolutions[3:7]") — restore the line break in the source
fig_height_test_clusters = nr_rows * height_per_rowtree_resolutions = tree_resolutions[3:7]
cluster_resolutions = sort(unique(c(param$cluster_resolution, param$cluster_resolution_test, tree_resolutions)))
p_list = list()
# One UMAP panel per resolution, coloured by that resolution's cluster assignment
for(r in cluster_resolutions) {
r = as.character(r)
n = paste0(DefaultAssay(sc), "_snn_res.", r)
# Legend labels show the cluster id plus its cell count, e.g. "3 (120)"
cluster_cells = table(sc[[n, drop=TRUE]])
cluster_labels = paste0(names(cluster_cells)," (", cluster_cells,")")
p_list[[r]] = Seurat::DimPlot(sc, reduction="umap", group.by=n, pt.size=param$pt_size, label=TRUE ) +
scale_color_manual(values=Seurat::Misc(sc, "colour_lists")[[n]], labels=cluster_labels) +
AddStyle(title=r, legend_position="none", xlab = "UMAP 1", ylab = "UMAP 2") +
FontSize(x.text = 10, y.text = 10, x.title = 13, y.title = 13, main = 15)
}
p = patchwork::wrap_plots(p_list, ncol=nr_cols) + patchwork::plot_annotation(title="UMAP, cells coloured by cluster identity for different resolution values")
p### Add metadata to misc
# Add colour lists for orig.dataset
col = GenerateColours(num_colours=length(levels(sc$orig.dataset)), names=levels(sc$orig.dataset), palette=param$col_palette_samples, alphas=1)
sc = ScAddLists(sc, lists=list(orig.dataset=col), lists_slot="colour_lists")
# Add experiment details
# Species is derived by stripping the "_gene_ensembl" suffix from the mart dataset name
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))
# Add parameter
Seurat::Misc(sc, "parameters") = param
# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))We export the assay data, cell metadata, clustering and visualization.
Result files:
* sc.rds: Seurat object for import into R
* cell_metadata.xlsx: Cell metadata in excel format
* counts folder: Contains raw count matrix of the entire dataset (sparse
matrix)
### Export seurat object
# Full Seurat object for downstream import into R
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))
### Export count matrix
# Raw counts as a sparse matrix plus all per-cell metadata columns
invisible(ExportSeuratAssayData(sc,
dir=file.path(param$path_out, "data", "counts"),
assays=param$assay_raw,
slot="counts",
include_cell_metadata_cols=colnames(sc[[]]),
metadata_prefix=paste0(param$project_id, ".")))
### Export metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)
### Export annotation as excel file
# Map Seurat rownames back to Ensembl ids and join the full Ensembl annotation table
openxlsx::write.xlsx(x=data.frame(seurat_id=rownames(sc), ensembl_gene_id=seurat_rowname_to_ensembl[rownames(sc)], row.names=rownames(sc)) %>%
dplyr::inner_join(annot_ensembl, by="ensembl_gene_id"),
file=file.path(param$path_out, "data", "gene_annotation.xlsx"),
rowNames=FALSE, colNames=TRUE)We export the assay data, cell metadata, clustering and visualization in andata format.
Result file:
* sc.h5ad: H5AD object
# Convert Seurat v5 single cell object to anndata object
# Fixed: removed the trailing empty argument (`, )`), which R rejects with
# "argument is empty" when the empty slot is captured by the function's `...`
scCustomize::as.anndata(x = sc, file_path = file.path(param$path_out, "data"), file_name = "sc_anndata.h5ad")
If all provided datasets are of type “10x”, we export the UMAP 2D visualization, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (Unknown 2024).
Result files are:
* Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… projections for
visualization
* Loupe_metadata.csv: Seurat cell meta data including clusters and cell
cycle phases
* Loupe_genesets.csv: Gene sets such as markers, DEGs, cell cycles
Projections can be imported in Loupe via “Import Projection”, cell meta data via “Import Categories” and gene sets via “Import Lists”.
### Export to Loupe Cell Browser
# Only possible when every input dataset is 10x (Loupe requires 10x barcodes)
if (all(param$path_data$type == "10x")) {
  # Export reductions (umap, pca, others); Loupe projections take exactly 2 dimensions
  for(r in Seurat::Reductions(sc)) {
    write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
                file=file.path(param$path_out, "data", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
  }
  # Export categorical metadata (Loupe "Import Categories" accepts non-numeric columns only)
  loupe_meta = sc@meta.data
  # vapply over the columns is type-stable, unlike sapply over 1:ncol()
  idx_keep = !vapply(loupe_meta, is.numeric, logical(1))
  # drop=FALSE keeps a data frame even if only one categorical column remains
  write.table(x=loupe_meta[, idx_keep, drop=FALSE] %>% tibble::rownames_to_column(var="barcode"),
              file=file.path(param$path_out, "data", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
  # Export gene sets (CC genes, known markers, per-cluster markers up- and downregulated, ...)
  gene_lists = Misc(sc, "gene_lists")
  # Remove empty gene sets; lengths() is the vectorized base replacement for map_int(length)
  gene_lists = gene_lists[lengths(gene_lists) > 0]
  loupe_genesets = purrr::map_dfr(names(gene_lists), function(n) {return(data.frame(List=n, Name=gene_lists[[n]]))})
  loupe_genesets$Ensembl = seurat_rowname_to_ensembl[loupe_genesets$Name]
  write.table(loupe_genesets, file=file.path(param$path_out, "data", "Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}
The following parameters were used to run the workflow.
out = ScrnaseqParamsInfo(params=param)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")| Name | Value |
|---|---|
| path_to_git | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis |
| scriptname | modules/pre-processing/pre-processing.Rmd |
| author | kosankem |
| assay_raw | RNA |
| downsample_cells_equally | FALSE |
| cell_filter | sample1:nFeature_RNA=c(500, 2400), nCount_RNA=c(NA, NA), percent_mt=c(NA, 20); sample2:nFeature_RNA=c(500, 2400), nCount_RNA=c(NA, NA), percent_mt=c(NA, 20) |
| feature_filter | sample1:min_counts=1, min_cells=5; sample2:min_counts=1, min_cells=5 |
| samples_min_cells | 10 |
| norm | RNA |
| cc_remove | FALSE |
| cc_remove_all | FALSE |
| cc_rescore_after_merge | TRUE |
| integrate_samples | method:merge |
| pc_n | 20 |
| cluster_k | 20 |
| umap_k | 30 |
| cluster_resolution_test | 0.5, 0.7, 0.8 |
| cluster_resolution | 0.6 |
| annot_version | 98 |
| annot_main | ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession |
| mart_attributes | ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description |
| marker_padj | 0.05 |
| marker_log2FC | 1 |
| marker_pct | 0.25 |
| deg_contrasts | condition_column:orig.ident; condition_group1:sample1; condition_group2:sample2; subset_column:seurat_clusters; subset_group:; downsample_cells_n:50 |
| enrichr_site | Enrichr |
| enrichr_padj | 0.05 |
| col_palette_samples | ggsci::pal_lancet |
| col_palette_clusters | ggsci::pal_igv |
| col_palette_annotation | ggsci::pal_igv |
| col | #0086b3 |
| col_bg | #D3D3D3 |
| pt_size | 0.5 |
| project_id | Testdata |
| path_data | name:sample1, sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/ |
| species | human |
| path_out | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/pre-processing |
| debugging_mode | default_debugging |
| mart_dataset | hsapiens_gene_ensembl |
| mt | ^MT- |
| enrichr_dbs | GO_Biological_Process_2018, WikiPathways_2019_Human, KEGG_2021_Human, Azimuth_Cell_Types_2021 |
| path_reference | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98 |
| reference | hsapiens_gene_ensembl.v98.annot.txt |
| file_annot | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt |
| file_cc_genes | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx |
| col_samples | sample1=#00468BFF, sample2=#ED0000FF |
| path_test_dataset | /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/modules/download_test_datasets/.R |
This report was generated using the modules/pre-processing/pre-processing.Rmd script. Software versions were collected at run time.
# Collect run-time software information (git commit, R/package versions) via the
# project helper and render it as a left-aligned, striped HTML table.
# NOTE(review): ScrnaseqSessionInfo comes from the sourced functions_*.R helpers;
# its return value is assumed to be a two-column table suitable for knitr::kable
# — confirm against functions_util.R.
out = ScrnaseqSessionInfo(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))

| Name | Value |
|---|---|
| Run on: | Thu Jul 04 06:19:20 PM 2024 |
| ktrns/scrnaseq | 8d6d5f32ac6860e21ad0b1f009fe7a7d820784e3 |
| Container | NA |
| R | R version 4.2.1 (2022-06-23) |
| Platform | x86_64-pc-linux-gnu (64-bit) |
| Operating system | Ubuntu 20.04.6 LTS |
| Host name | hpc-rc11 |
| Host OS | #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic) |
| Packages | abind1.4-5, ape5.8, backports1.4.1, beeswarm0.4.0, bibtex0.5.1, bslib0.6.1, cachem1.0.8, checkmate2.3.1, circlize0.4.16, cli3.6.2, cluster2.1.3, clustree0.5.1, codetools0.2-18, colorspace2.1-0, cowplot1.1.3, crayon1.5.2, curl5.2.1, data.table1.15.2, deldir2.0-4, digest0.6.34, dotCall641.1-1, dplyr1.1.4, ellipsis0.3.2, evaluate0.23, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, fitdistrplus1.1-11, forcats1.0.0, future1.33.1, future.apply1.11.1, generics0.1.3, ggbeeswarm0.7.2, ggforce0.4.2, ggplot23.5.0, ggprism1.0.5, ggraph2.2.1, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, GlobalOptions0.1.2, globals0.16.2, glue1.7.0, goftest1.2-3, graphlayouts1.1.1, gridExtra2.3, gtable0.3.4, highr0.10, hms1.1.3, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, irlba2.3.5.1, janitor2.2.0, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, lifecycle1.0.4, listenv0.9.1, lmtest0.9-40, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, Matrix1.6-5, matrixStats1.1.0, memoise2.0.1, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, openxlsx4.2.5.2, paletteer1.6.0, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, RANN2.6.1, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, readr2.1.5, RefManageR1.4.0, rematch22.1.2, remotes2.4.2.1, renv0.16.0, reshape21.4.4, reticulate1.35.0, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, rstudioapi0.15.0, Rtsne0.17, sass0.4.8, scales1.3.0, scattermore1.2, scCustomize2.1.2, sceasy0.0.7, sctransform0.4.1, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shape1.4.6.1, shiny1.8.0, snakecase0.11.1, sp2.1-3, spam2.10-0, spatstat.data3.0-4, spatstat.explore3.2-6, spatstat.geom3.2-9, 
spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, stringi1.8.3, stringr1.5.1, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, tibble3.2.1, tidygraph1.3.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, tweenr2.0.3, tzdb0.4.0, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridis0.6.5, viridisLite0.4.2, withr3.0.0, xfun0.41, xml21.3.6, xtable1.8-4, yaml2.3.8, zip2.3.1, zoo1.8-12 |
This workflow was developed as a module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).
# Export the bibliography collected by knitcitations during rendering
# to a references.bib file in the configured output directory.
bib_file = file.path(param$path_out, "references.bib")
knitcitations::write.bibtex(file=bib_file)